getwd()
[1] "/Users/manasa/Documents/Work/TTT/02_Proteomics/02_First-TMT-data"
Now that we have loaded all the packages we need for working with this data, let’s move on to the data.
# -----------------------------------------------
# Step 00: Read data
# Read in all the data required for analysis
# -----------------------------------------------
# File of contaminants - proteins to exclude from analysis as are things like keratin, alcohol dehydrogenase etc....
contam = read.delim("Input/Common contaminant_all.csv",sep=",",header=T)
# Read in the sample file that matches columns to sample contents
samp.dat = read.delim("Input/samples.txt",sep="\t",header=T)
# Read in the data files that contain peptide level output from Proteome discoverer...
# Note: The columns that begin with "Found.in.Sample.in" correspond to various samples in the study.
# Columns of interest are "sequence", "modifications","master.protein.accessions","abundance","quan.info"
data = read.delim("Input/Dosages_4step-Trizol_PeptideGroups.txt",sep="\t",comment.char="",as.is=T,header=T)
# Subset data to only keep columns of interest
prot.data = data[,c(1:6,10:14,17,42:51,62)]
# Rename tmt tagged columns with UV dosage names
for(i in 1:nrow(samp.dat)){
id = grep(samp.dat$TMT[i],colnames(prot.data))
colnames(prot.data)[id] = paste(samp.dat$Sample[i])
}
dim(prot.data)
[1] 14456 23
head(prot.data)
data has 23 columns and 14456 rows - each row belonging to a peptide abundance value across all 10 samples. We now go through a series of filtering steps to obtain a dataset we can use for downstream analyses.
# ---------------------------------------------------------------------------------
# Step 01 : Filter
# We perform 3 layers of filtering - unique proteins, contaminants,missing values
# ---------------------------------------------------------------------------------
# Step 1a : Filter only for those peptides that have a unique master protein. Done using column "quan.info" and titled 'Unique'
peptide.stats = table(prot.data$Quan.Info)
peptide.stats
NoQuanValues NotUnique Unique
1440 715 12301
filt.1a = prot.data[which(prot.data$Quan.Info == "Unique"),]
dim(filt.1a) #12301 are unique peptides, 715 are non-unique and 1440 are missing values
[1] 12301 23
# Step 1b : Filter out those proteins that are contaminants from the contaminants list and annotate missing values
filt.1b = filt.1a[-which(filt.1a$Master.Protein.Accessions %in% contam$Protein.Group.Accessions),]
num.contams = length(which(filt.1a$Master.Protein.Accessions %in% contam$Protein.Group.Accessions))
dim(filt.1a) # 12301 in total
[1] 12301 23
dim(filt.1b) # 12077 filtered proteins
[1] 12077 23
print(num.contams) # 224 contaminant proteins
[1] 224
# Adding extra information about rows with missing values
filt.1b$count.missing = rowSums(is.na(filt.1b[,c(13:22)]))
filt.1b$Missing = FALSE
filt.1b$Missing[which(filt.1b$count.missing > 0)] = TRUE
head(filt.1b)
We have a column called “Missing” to identify which peptides have one or more missing values across the 10 samples. “count.missing” tells us how many missing values there are for that peptide.
# ---------------------------------------------------------------------------------
# Step 02a : Creating an MSnSet which is needed for using the MSnbase backage
# ---------------------------------------------------------------------------------
# The rownames of samp.dat have to be the same as column names in the expression data matrix
rownames(samp.dat) = samp.dat$Sample
# Create an MSnSet object
res <- MSnSet(exprs = as.matrix(filt.1b[,c(13:22)]),fData=filt.1b[,c(10,1:9,12,23:25)],pData = samp.dat[,c(2,1)])
res <- res[rowSums(is.na(exprs(res)))!=10,] # exclude peptides without any quantification
print(res)
MSnSet (storageMode: lockedEnvironment)
assayData: 12077 features, 10 samples
element names: exprs
protocolData: none
phenoData
sampleNames: 150mJ.1 150mJ.2 ... Pool.1 (10 total)
varLabels: TMT Sample
varMetadata: labelDescription
featureData
featureNames: 2 3 ... 14455 (12077 total)
fvarLabels: Master.Protein.Accessions Checked ... Missing (14 total)
fvarMetadata: labelDescription
experimentData: use 'experimentData(object)'
Annotation:
- - - Processing information - - -
Subset [12077,10][12077,10] Tue Aug 29 10:00:37 2017
MSnbase version: 2.0.2
# How many missing values per peptide
table(fData(res)$count.missing)
0 1 2 3 4 5 6 7 8 9
11451 551 38 16 8 4 2 4 1 2
colSums(is.na(exprs(res)))
150mJ.1 150mJ.2 150mJ.3 275mJ.1 275mJ.2 275mJ.3 400mJ.1 400mJ.2 400mJ.3 Pool.1
22 11 612 49 6 17 6 33 6 31
table(rowSums(is.na(exprs(res))))
0 1 2 3 4 5 6 7 8 9
11451 551 38 16 8 4 2 4 1 2
# Checking missing values
table(is.na(res))
FALSE TRUE
119977 793
The MSnSet object has been created to include protein abundance values, some metadata and sample information (UV dosage in this case)
# ---------------------------------------------------------------------------------
# Step 02a : Imputing missing values in the protein expression data using 'impute'
# ---------------------------------------------------------------------------------
# Subsetting only those peptides with one or more missing values
# Replacing missing values with 0 and non-missing with 1
# Displaying missing values
miss.many = res[rowSums(is.na(exprs(res)))>=1,]
exprs(miss.many)[exprs(miss.many) != 0] = 1
exprs(miss.many)[is.na(exprs(miss.many))] = 0
heatmap.2(exprs(miss.many), col = c("lightgray", "black"),
scale = "none", dendrogram = "none",
trace = "none", keysize = 0.5, key = FALSE,Colv=F,
ColSideColors = rep(c("steelblue", "darkolivegreen","magenta","black"), times = c(3,3,3,1)))
# Impute missing values
impute.res <- impute(res,method = "knn")
9 rows with more than 50 % entries missing;
mean imputation used for these rows
Cluster size 12068 broken into 11832 236
Cluster size 11832 broken into 8982 2850
Cluster size 8982 broken into 3383 5599
Cluster size 3383 broken into 1164 2219
Done cluster 1164
Cluster size 2219 broken into 1271 948
Done cluster 1271
Done cluster 948
Done cluster 2219
Done cluster 3383
Cluster size 5599 broken into 3042 2557
Cluster size 3042 broken into 1360 1682
Done cluster 1360
Cluster size 1682 broken into 631 1051
Done cluster 631
Done cluster 1051
Done cluster 1682
Done cluster 3042
Cluster size 2557 broken into 1453 1104
Done cluster 1453
Done cluster 1104
Done cluster 2557
Done cluster 5599
Done cluster 8982
Cluster size 2850 broken into 725 2125
Done cluster 725
Cluster size 2125 broken into 1238 887
Done cluster 1238
Done cluster 887
Done cluster 2125
Done cluster 2850
Done cluster 11832
Done cluster 236
pData(impute.res)$Sample = gsub('\\s+','',pData(impute.res)$Sample)
# Plot imputed values
res.miss = melt(exprs(res))
colnames(res.miss) = c("Row","Dosage","Abundance_imp")
res.no.miss = melt(exprs(impute.res))
colnames(res.no.miss) = c("Row","Dosage","Abundance")
imp.vals = res.no.miss[which(is.na(res.miss$Abundance_imp)),]
# Some boxplots of the data
#---------------------------
# All data including missing values
boxplot(log2(res.miss$Abundance)~as.factor(res.miss$Dosage),las=2,col=rep(c("turquoise", "salmon","palegreen","plum1"),times=c(3,3,3,1)),main="All data including missing values")
# Imputed values/missing values only
b.imp = boxplot(log2(imp.vals$Abundance)~imp.vals$Dosage,las=2,col=rep(c("turquoise", "salmon","palegreen","plum1"),times=c(3,3,3,1)),main="Imputed values only")
boxplot(log2(imp.vals$Abundance)~imp.vals$Dosage,las=2,names = paste(b.imp$names," (n=", b.imp$n, ")",sep=""),col=rep(c("turquoise", "salmon","palegreen","plum1"),times=c(3,3,3,1)),main="Imputed values only",,cex.axis = 0.6)
# All data icluding newly imputed values
boxplot(log2(res.no.miss$Abundance)~as.factor(res.no.miss$Dosage),las=2,col=rep(c("turquoise", "salmon","palegreen","plum1"),times=c(3,3,3,1)),main="All data with imputed values")
# Additional plots showing fraction of missing values
# Not much use as in our case, it is very small.
vim.dat = data.frame(cbind(Abundance_imp=res.miss$Abundance_imp,Abundance=res.no.miss$Abundance),stringsAsFactors = F)
head(vim.dat)
vim.dat$which.miss = FALSE
vim.dat$which.miss[which(is.na(vim.dat$Abundance_imp))] = TRUE
table(vim.dat$which.miss)
FALSE TRUE
119977 793
histMiss(vim.dat, only.miss = F)
Click in in the left margin to switch to the previous variable or in the right margin to switch to the next variable.
To regain use of the VIM GUI and the R console, click anywhere else in the graphics window.
pbox(vim.dat, ylim=c(0,500),selection="none")
Click in in the left margin to switch to the previous variable or in the right margin to switch to the next variable.
To regain use of the VIM GUI and the R console, click anywhere else in the graphics window.
matrixplot(vim.dat)
Click in a column to sort by the corresponding variable.
To regain use of the VIM GUI and the R console, click outside the plot region.
We have a small number of missing values - 793 peptides have one or more missing values out of 12244 peptides in total. 551/793 are missing in 1 sample only and 38/793 in 2 samples. Only 2 peptides are missing in 9/10 samples.
# ---------------------------------------------------------------------------------
# Step 03 : Normalising imputed data using various methods to determine ideal one
# ---------------------------------------------------------------------------------
qnt.max <- normalise(impute.res, "max")
qnt.sum <- normalise(impute.res, "sum")
qnt.quant <- normalise(impute.res, "quantiles")
qnt.qrob <- normalise(impute.res, "quantiles.robust")
qnt.vsn <- normalise(impute.res, "vsn")
vsn2: 12077 x 10 matrix (1 stratum). Please use 'meanSdPlot' to verify the fit.
## ---- plotting function---------
.plot <- function(x,ttl=NULL) {
boxplot(exprs(x),
main=ifelse(is.null(ttl),processingData(x)@processing[2],ttl),
cex.main=1.5,
cex.lab=.5,
cex.axis=0.8,
cex=.8,
las=2)
grid()
}
# Using the plotting function to plot boxplots for all diff types of normalisation methods
oldmar <- par()$mar
par(mfrow=c(3,2),mar=c(2.9,2.9,2.9,1))
.plot(impute.res, ttl = "Non-normalised data")
.plot(qnt.max, ttl = "Maximum")
.plot(qnt.sum, ttl = "Sum")
.plot(qnt.quant, ttl = "Quantile")
.plot(qnt.qrob, ttl = "Robust quantile")
.plot(qnt.vsn, ttl = "vsn")
# Checking the effects of normalisation on covariance
sd1 <- apply(log2(exprs(impute.res))+10,1,sd)
mn1 <- apply(log2(exprs(impute.res))+10,1,mean)
cv1 <- sd1/mn1
sd2 <- apply(exprs(qnt.vsn)+10,1,sd)
mn2 <- apply(exprs(qnt.vsn)+10,1,mean)
cv2 <- sd2/mn2
dfr <- rbind(data.frame(rank=order(mn1),cv=cv1,norm="raw"),
data.frame(rank=order(mn2),cv=cv2,norm="vsn"))
rmed1 <- rollapply(cv1,7,function(x) median(x,na.rm=TRUE))
rmed2 <- rollapply(cv2,7,function(x) median(x,na.rm=TRUE))
dfr2 <- rbind(data.frame(x=seq(0,30,by=30/length(rmed1))[-1],y=rmed1,norm="raw"),data.frame(x=seq(0,30,by=30/length(rmed2))[-1],y=rmed2,norm="vsn"))
p <- ggplot()+geom_line(data=dfr2,aes(x=x,y=y,col=norm)) + theme_gray(7)
plot(p)
Will keep the data from the vsn normalisation for downstream analyses as it normalises the data better than other methods used in this comparison such as “sum”, “max”, “quantile”.
For all further steps, we will use the object “qnt.vsn”
# ---------------------------------------------------------------------------------
# Step 04a : Aggregate peptide data to protein expression values
# There is an in-built function called 'combineFeatures' to do thi within MSnBase
# ---------------------------------------------------------------------------------
protnames <- fData(qnt.vsn)$Master.Protein.Accessions
table(protnames)
protnames
A0A024QZP7 A0A024R216 A0A024R4E5 A0A024R644 A0A075B716 A0A087WSV8 A0A087WTW0 A0A087WUL9 A0A087WUT6 A0A087WUZ3 A0A087WV75 A0A087WVC6 A0A087WVP1 A0A087WVQ6 A0A087WW66
1 5 30 2 2 3 2 3 42 48 6 3 4 38 9
A0A087WX22 A0A087WXF8 A0A087WY31 A0A087WYL5 A0A087WYN9 A0A087WZ30 A0A087WZT3 A0A087X0H9 A0A087X117 A0A087X1R1 A0A087X1U6 A0A087X1Z3 A0A087X211 A0A087X2G6 A0A087X2H1
2 2 3 2 2 6 2 3 2 8 1 2 2 1 1
A0A087X2I1 A0A096LNX8 A0A0A0MQS2 A0A0A0MQX8 A0A0A0MR39 A0A0A0MR66 A0A0A0MRM9 A0A0A0MRX1 A0A0A0MSE2 A0A0A0MSW4 A0A0A0MT33 A0A0A0MT49 A0A0A0MT64 A0A0A0MTC1 A0A0A0MTC5
3 5 4 6 8 14 27 7 2 2 3 6 5 2 8
A0A0A0MTH3 A0A0A0MTS2 A0A0A6YYL6 A0A0B4J1V8 A0A0B4J2E5 A0A0C4DFM7 A0A0C4DG17 A0A0C4DG49 A0A0C4DGB5 A0A0C4DGG9 A0A0C4DGH0 A0A0D9SEN2 A0A0D9SF30 A0A0D9SF53 A0A0D9SF98
3 4 5 5 5 2 4 4 3 14 4 11 6 20 1
A0A0G2JIW1 A0A0G2JJL1 A0A0G2JK23 A0A0G2JK44 A0A0G2JNW7 A0A0G2JPP5 A0A0R4J2E2 A0A0U1RQK7 A0A0U1RRH6 A0A0U1RRM6 A0A0X1KG71 A0A140LJL2 A0A140T930 A0A1B0GTU4 A0A1B0GUW5
8 1 3 4 4 13 3 4 3 5 2 2 1 3 1
A0A1B0GW77 A0A1C7CYX9 A0AVT1 A1L020 A1X283 A5YKK6 A6NHR9 A8K878 A8K968 A8MXP9 A8MYA2 A8MYK1 A9Z1X7 B0QY89 B0YIW6
5 8 6 2 2 5 5 3 1 32 5 2 6 6 4
B1AK88 B1ALY0 B3KS98 B4DDD6 B4DUT8 B4DY08 B5MBZ0 B5MCU0 B5ME97 B7Z645 B7Z6Z4 B7ZKJ3 B7ZKQ9 B7ZLQ5 B7ZM99
2 1 2 5 3 22 7 1 1 1 4 1 4 2 5
B8ZZD4 B8ZZN6 C9J6P4 C9JFV4 C9JIF9 C9JIZ6 C9JSZ1 D3DQV9 D6RBZ0 D6RER5 D6REX3 D6W592 E5RJD8 E5RJR5 E7EMB3
3 2 9 2 4 17 1 11 6 2 4 3 3 2 6
E7EPK1 E7EQ64 E7EQR4 E7EQT4 E7EQZ4 E7ERH1 E7ERS3 E7ESC6 E7ETY2 E7EVA0 E7EW49 E7EWR4 E7EX70 E9PAU2 E9PAV3
2 1 9 11 3 1 15 1 1 25 3 5 3 6 3
E9PB61 E9PGC5 E9PGC8 E9PHI4 E9PK25 E9PKU7 E9PLY5 E9PMV1 E9PNQ8 E9PQY2 E9PR17 E9PRY8 F1T0I1 F5GXR6 F5GYJ8
10 2 14 9 5 1 1 1 2 2 3 10 13 3 2
F5H039 F5H2X7 F5H5D3 F6S8M0 F6WQW2 F8VU51 F8VXC8 F8W0W8 F8W6C2 F8W727 F8W930 F8WCT1 G3V0J0 G3V180 G3V1L9
4 2 1 6 4 9 3 1 1 4 15 4 1 2 9
G3V3B0 G3XAI2 G5E9Q6 G5EA03 G5EA30 G8JLB6 H0Y412 H0Y4R2 H0Y704 H0Y8C2 H0Y8C6 H0Y8P4 H0YA96 H0YAB7 H0YBA3
1 10 2 2 9 14 1 6 1 2 3 5 1 1 2
H0YH80 H0YHG0 H0YIQ2 H0YKD8 H0YMD1 H3BLZ8 H3BPE7 H3BQK0 H3BQK9 H3BQM0 H3BR70 H3BUF6 H7BY55 H7BY57 H7BY58
1 2 5 2 7 14 6 4 25 2 1 1 3 5 2
H7BYY1 H7BZJ3 H7C215 H7C2I1 H7C2Q8 H7C4C5 H9KVB4 I3L2B0 I3L4C2 I3L504 J3KMX3 J3KN01 J3KN16 J3KN66 J3KPF3
1 1 2 3 8 1 6 5 3 3 2 5 2 1 11
J3KPP4 J3KPS3 J3KQ69 J3KQE5 J3KQN4 J3KTA4 J3KTL2 J3QK89 J3QQ67 J3QR07 J3QRS3 J3QRU1 J3QS41 J3QSU6 K7EJ20
12 15 6 3 1 24 11 5 2 8 3 3 2 30 2
K7EJ78 K7ELC7 K7ELL7 K7ELX0 K7ES00 M0QYN0 M0QYS1 M0QZM1 M0R0R2 M0R2B7 M0R2Z9 O00116 O00159 O00170 O00192
3 3 5 3 1 2 4 1 3 2 16 4 5 2 2
O00193 O00203 O00231 O00264 O00267 O00299 O00411 O00429 O00443 O00461 O00468 O00469 O00505 O00522 O00541
6 8 2 3 8 7 3 7 2 8 14 5 1 1 7
O00566 O00567 O00592 O00622 O00629 O14497 O14578 O14639 O14646 O14647 O14672 O14686 O14744 O14745 O14773
6 6 4 4 2 4 5 5 13 7 2 2 2 6 2
O14776 O14777 O14786 O14818 O14908 O14964 O14974 O14979 O14980 O15014 O15031 O15042 O15047 O15067 O15213
8 4 6 3 2 6 10 13 6 5 10 16 3 2 1
O15226 O15230 O15231 O15234 O15294 O15347 O15355 O15357 O15371 O15439 O15446 O15479 O15481 O15498 O43143
6 12 4 8 5 2 11 2 9 2 6 7 1 2 10
O43148 O43159 O43175 O43242 O43251 O43290 O43347 O43390 O43491 O43493 O43615 O43707 O43719 O43776 O43795
2 3 7 3 5 11 2 17 9 19 1 10 23 6 4
O43809 O43818 O43823 O43852 O60231 O60264 O60279 O60282 O60306 O60333 O60506 O60507 O60524 O60568 O60664
2 2 11 4 3 5 4 1 2 1 4 2 7 2 7
O60684 O60701 O60716 O60732 O60763 O60828 O60832 O60869 O60885 O75054 O75083 O75116 O75146 O75147 O75152
1 2 11 3 5 2 8 6 3 2 6 5 3 2 9
O75165 O75175 O75179 O75369 O75400 O75475 O75487 O75494 O75525 O75533 O75534 O75569 O75616 O75643 O75683
2 1 3 71 19 10 7 7 2 17 30 2 1 17 4
O75691 O75694 O75717 O75821 O75822 O75874 O75909 O75914 O75937 O75962 O75976 O76003 O76021 O76094 O94776
9 4 2 3 10 2 3 2 2 8 13 3 8 8 5
O94822 O94826 O94903 O94906 O94913 O94925 O94992 O95071 O95104 O95155 O95218 O95239 O95292 O95297 O95302
3 3 3 5 2 3 2 6 2 2 11 12 1 3 6
O95347 O95373 O95425 O95429 O95433 O95490 O95602 O95613 O95678 O95758 O95782 O95793 O95817 O95985 O96013
12 5 5 1 3 3 2 12 1 2 2 8 6 3 4
O96028 P00338 P00387 P00505 P00533 P00558 P00734 P01023 P02452 P02462 P02533 P02538 P02545 P02751 P02765
2 5 2 2 4 11 2 1 30 2 2 2 42 20 2
P02786 P04083 P04156 P04183 P04259 P04406 P04792 P04843 P04899 P04920 P05023 P05026 P05067 P05091 P05141
23 8 3 2 1 21 7 6 2 2 10 12 5 3 2
P05198 P05387 P05455 P05556 P05783 P05787 P06576 P06733 P06748 P06756 P07195 P07237 P07305 P07355 P07437
4 3 21 31 3 7 4 25 10 23 5 27 2 21 2
P07737 P07814 P07900 P07951 P07996 P08069 P08133 P08134 P08183 P08237 P08238 P08572 P08582 P08621 P08648
5 25 11 2 14 7 5 2 5 1 19 3 2 18 8
P08670 P08779 P09012 P09382 P09429 P09496 P09619 P09651 P09874 P09960 P0C0S5 P0C7U0 P0DMU7 P10412 P10586
34 6 3 5 13 4 3 9 27 3 2 10 4 2 21
P10809 P10909 P10915 P11021 P11047 P11117 P11142 P11166 P11216 P11279 P11387 P11388 P11413 P11586 P11717
15 3 2 24 20 2 23 3 4 6 34 9 7 23 27
P11766 P11940 P12004 P12109 P12110 P12111 P12268 P12270 P12814 P12956 P13010 P13473 P13489 P13639 P13647
2 18 3 4 2 6 9 26 14 12 12 3 4 25 16
P13667 P13674 P13797 P14384 P14543 P14618 P14625 P14866 P14868 P15170 P15328 P15529 P15531 P15880 P15924
15 2 2 2 3 14 18 36 8 2 5 2 4 15 9
P16070 P16104 P16403 P16615 P16949 P16989 P17050 P17096 P17252 P17480 P17655 P17812 P17858 P17980 P17987
16 2 3 4 3 12 2 4 2 28 4 2 2 3 7
P18084 P18124 P18206 P18433 P18583 P18615 P18669 P18827 P18858 P19013 P19022 P19174 P19338 P19367 P19525
6 13 21 3 18 2 3 3 3 1 13 3 42 7 6
P19823 P20042 P20073 P20248 P20290 P20645 P20700 P20930 P21333 P21359 P21796 P21980 P22087 P22102 P22234
4 6 3 2 4 7 39 2 96 1 3 4 5 5 6
P22314 P22626 P22695 P23229 P23246 P23284 P23368 P23396 P23470 P23526 P23588 P24534 P24666 P24752 P24928
15 35 2 3 31 7 2 12 2 6 24 4 2 4 11
P25398 P25445 P25685 P25686 P25705 P25786 P25942 P26006 P26038 P26196 P26358 P26368 P26373 P26583 P26599
4 2 2 2 9 2 2 6 8 5 9 7 9 6 13
P26639 P26640 P26641 P27348 P27635 P27694 P27695 P27708 P27797 P27824 P28482 P28799 P28838 P29317 P29323
12 16 8 3 4 8 4 27 7 3 2 2 4 6 3
P29353 P29372 P29401 P29558 P29590 P30040 P30043 P30044 P30048 P30050 P30086 P30101 P30153 P30260 P30414
2 2 9 1 3 2 2 2 3 2 2 21 4 2 10
P30419 P30530 P30533 P30622 P30825 P30876 P31151 P31431 P31483 P31689 P31930 P31939 P31942 P31946 P31947
4 4 3 2 2 8 2 8 2 6 1 3 19 3 4
P31948 P32004 P32119 P32969 P33176 P33991 P33992 P33993 P34741 P34932 P35052 P35221 P35222 P35232 P35241
10 12 4 2 9 7 2 5 3 21 17 7 5 3 4
P35249 P35251 P35268 P35269 P35556 P35568 P35579 P35580 P35606 P35613 P35658 P35659 P35998 P36578 P36915
4 4 7 12 5 3 86 18 3 9 7 13 5 12 3
P37198 P37275 P37802 P38159 P38432 P38606 P38646 P38919 P39019 P39023 P39687 P39748 P40222 P40227 P40261
4 2 7 10 6 4 17 2 7 6 14 3 15 5 2
P40692 P40925 P40926 P40939 P41214 P41227 P41240 P41250 P41252 P41440 P42166 P42167 P42224 P42285 P42696
2 2 4 8 6 2 2 8 10 3 2 1 4 5 2
P42704 P42766 P42785 P42892 P43007 P43121 P43246 P43358 P43686 P45880 P45973 P45974 P46013 P46060 P46063
33 2 4 8 3 27 4 3 2 4 1 9 75 1 5
P46087 P46109 P46459 P46776 P46777 P46778 P46781 P46783 P46821 P46939 P46940 P46976 P47897 P47914 P48634
10 4 2 5 19 2 10 4 16 22 19 2 5 6 35
P48643 P48681 P48960 P49006 P49137 P49189 P49207 P49257 P49321 P49327 P49368 P49411 P49454 P49588 P49589
9 10 5 6 2 3 11 1 21 29 13 9 21 11 10
P49736 P49755 P49756 P49790 P49792 P49915 P50395 P50454 P50479 P50552 P50570 P50750 P50895 P50897 P50914
3 2 22 8 51 8 4 8 3 2 2 1 10 3 4
P50990 P50991 P50993 P50995 P51114 P51116 P51148 P51149 P51153 P51452 P51531 P51610 P51648 P51665 P51858
15 5 1 2 11 5 3 3 2 2 1 6 2 2 4
P51991 P52209 P52272 P52292 P52294 P52306 P52597 P52701 P52732 P52756 P52907 P52948 P53350 P53396 P53618
12 3 24 6 2 1 10 11 8 2 4 4 2 8 2
P53621 P53992 P53999 P54132 P54136 P54289 P54577 P54578 P54709 P54727 P54886 P55011 P55036 P55060 P55072
8 2 7 1 5 8 14 2 3 3 5 8 3 10 23
P55081 P55084 P55209 P55265 P55268 P55290 P55786 P55795 P55884 P56182 P56192 P57721 P57723 P57740 P60174
2 2 2 12 10 5 4 6 6 7 6 1 4 3 9
P60228 P60468 P60709 P60842 P60866 P60903 P61011 P61020 P61026 P61106 P61158 P61221 P61247 P61254 P61313
2 2 8 9 3 2 6 1 1 1 3 3 14 5 2
P61586 P61604 P61927 P61956 P61978 P61981 P62081 P62136 P62191 P62195 P62241 P62249 P62258 P62263 P62266
2 2 2 1 6 3 6 3 2 3 7 2 2 8 2
P62269 P62277 P62280 P62316 P62424 P62633 P62701 P62736 P62750 P62753 P62805 P62820 P62829 P62847 P62851
5 2 10 3 11 7 8 2 7 8 7 1 2 4 5
P62857 P62879 P62888 P62899 P62906 P62913 P62917 P62995 P63010 P63104 P63151 P63167 P63244 P67809 P67936
2 3 3 5 3 3 12 11 4 9 3 1 8 19 3
P68104 P68363 P68366 P68371 P68431 P78310 P78316 P78324 P78332 P78347 P78357 P78371 P78527 P78536 P82675
8 1 2 2 1 2 5 6 11 2 3 4 55 9 4
P82933 P82970 P83731 P84098 P84103 P85037 P98155 P98160 P98179 Q00577 Q00653 Q00688 Q00839 Q01081 Q01085
3 12 7 4 9 3 6 13 6 5 5 3 28 3 8
Q01105 Q01130 Q01433 Q01469 Q01518 Q01581 Q01650 Q01780 Q01804 Q01813 Q01831 Q01844 Q01970 Q02224 Q02241
10 3 7 6 8 6 3 7 4 1 4 5 6 6 8
Q02539 Q02543 Q02790 Q02809 Q02878 Q02880 Q02952 Q03001 Q03188 Q03252 Q03405 Q03701 Q04637 Q04695 Q04721
2 7 8 6 10 5 10 20 1 12 7 8 27 5 4
Q04760 Q04837 Q04917 Q05209 Q05519 Q05639 Q05682 Q06203 Q06210 Q06481 Q06787 Q07065 Q07352 Q07666 Q07954
2 2 1 2 10 2 6 2 6 3 1 24 2 11 43
Q08170 Q08209 Q08211 Q08379 Q08380 Q08554 Q08722 Q08945 Q08AM6 Q08J23 Q09028 Q09161 Q09666 Q10570 Q12765
4 1 21 4 5 3 3 19 1 7 3 2 212 2 3
Q12769 Q12788 Q12789 Q12792 Q12797 Q12830 Q12841 Q12849 Q12874 Q12888 Q12904 Q12905 Q12906 Q12931 Q12959
2 3 3 2 6 12 13 9 8 9 3 4 20 2 3
Q13085 Q13123 Q13136 Q13148 Q13151 Q13155 Q13162 Q13185 Q13200 Q13206
2 4 3 4 9 2 2 4 11 5
[ reached getOption("max.print") -- omitted 744 entries ]
length(unique(protnames)) # 1744 proteins present in all 10 samples
[1] 1744
# Aggregating peptide abundance values into protein abundance values
qnt.prot <- combineFeatures(qnt.vsn, groupBy = protnames, fun = "median")
Combined 12077 features into 12077 using median
dim(qnt.prot)
[1] 1744 10
head(exprs(qnt.prot))
150mJ.1 150mJ.2 150mJ.3 275mJ.1 275mJ.2 275mJ.3 400mJ.1 400mJ.2 400mJ.3 Pool.1
A0A024QZP7 3.816467 3.620175 3.927225 4.052191 3.357771 3.310799 3.357685 3.268323 3.294870 3.651255
A0A024R216 7.599068 8.060989 6.553413 5.027294 7.777270 7.677502 7.354644 7.367304 7.238476 7.079061
A0A024R4E5 6.329437 5.803502 6.730733 6.286203 6.320219 6.172760 5.830347 5.964010 5.750300 6.287400
A0A024R644 5.396222 5.055941 4.600265 4.333401 4.729008 4.757164 4.156337 4.452842 4.393996 4.887391
A0A075B716 6.043305 5.677170 5.677677 5.247188 5.606109 5.426848 5.970442 5.801846 5.696659 5.619219
A0A087WSV8 4.562525 5.022626 4.362861 4.670520 4.962158 5.500905 6.268788 5.923221 5.913728 4.863713
head(exprs(qnt.vsn))
150mJ.1 150mJ.2 150mJ.3 275mJ.1 275mJ.2 275mJ.3 400mJ.1 400mJ.2 400mJ.3 Pool.1
2 2.768493 1.966696 6.596663 6.509413 2.401781 6.540405 1.903026 6.520467 6.507783 6.511016
3 4.166554 4.035286 5.620443 6.395213 4.589417 5.245986 6.144995 5.496511 5.716597 5.117277
4 5.200775 5.765512 5.219895 5.058563 5.142504 5.314060 4.817990 4.729333 4.899958 4.730216
5 5.892083 5.608025 5.671977 5.279402 5.237583 4.981204 5.499543 5.309686 5.272549 5.135761
6 5.455946 5.232610 4.995591 5.433890 5.079733 4.986121 5.175707 5.402227 5.270593 5.470324
7 2.355212 1.760496 4.276767 3.345973 1.786217 1.852261 2.003715 2.348329 1.819560 2.025044
# Basic plots of protein data across samples
.plot(qnt.prot,ttl="Aggregated-proteins")
plot(hclust(dist(exprs(t(qnt.prot)))))
# Looking at sample correlations
cor.prot = cor(exprs(qnt.prot))
heatmap(cor.prot,cex.main = 0.8)
dissimilarity <- 1 - cor.prot
distance <- as.dist(dissimilarity)
plot(hclust(distance))
pairs(x = exprs(qnt.prot), upper.panel=NULL, pch=20)
# Using the "duplicatecorrelation" function in limma to test correlation between technical replicates
# If Rayner's ordering is correct, then the samples are
biolrep.rq <- c(1,1,1,2,2,2,3,3,3,4)
# If there is a swap between Pool.1 and 275mj.rep1, then it should be
biolrep.swap <- c(1,1,1,4,2,2,3,3,3,2)
# If it is indeed a swap, then the correlation should increase when corrected. It does!
rq.cor = duplicateCorrelation(qnt.prot,ndups=1,block=biolrep.rq)$consensus.correlation # 0.203
swap.cor = duplicateCorrelation(qnt.prot,ndups=1,block=biolrep.swap)$consensus.correlation # 0.564
rq.cor
[1] 0.2031868
swap.cor
[1] 0.5641893
cor.prot
150mJ.1 150mJ.2 150mJ.3 275mJ.1 275mJ.2 275mJ.3 400mJ.1 400mJ.2 400mJ.3 Pool.1
150mJ.1 1.0000000 0.9335052 0.7911591 0.4153498 0.9280369 0.8853188 0.6821358 0.7285274 0.6684451 0.8458091
150mJ.2 0.9335052 1.0000000 0.7551314 0.4370340 0.9500877 0.9236256 0.7780666 0.8027901 0.7780055 0.8816818
150mJ.3 0.7911591 0.7551314 1.0000000 0.5989892 0.7830856 0.7672976 0.6600698 0.6912729 0.6334646 0.7386629
275mJ.1 0.4153498 0.4370340 0.5989892 1.0000000 0.5515125 0.5783829 0.6087879 0.6728370 0.6634102 0.6850855
275mJ.2 0.9280369 0.9500877 0.7830856 0.5515125 1.0000000 0.9663206 0.8269695 0.8625385 0.8276158 0.9248869
275mJ.3 0.8853188 0.9236256 0.7672976 0.5783829 0.9663206 1.0000000 0.8494414 0.8778792 0.8443968 0.9432591
400mJ.1 0.6821358 0.7780666 0.6600698 0.6087879 0.8269695 0.8494414 1.0000000 0.9486792 0.9224763 0.8623312
400mJ.2 0.7285274 0.8027901 0.6912729 0.6728370 0.8625385 0.8778792 0.9486792 1.0000000 0.9264799 0.9100994
400mJ.3 0.6684451 0.7780055 0.6334646 0.6634102 0.8276158 0.8443968 0.9224763 0.9264799 1.0000000 0.8713971
Pool.1 0.8458091 0.8816818 0.7386629 0.6850855 0.9248869 0.9432591 0.8623312 0.9100994 0.8713971 1.0000000
#duplicateCorrelation(qnt.prot[,1:9],ndups=1,block=c(1,1,1,2,2,2,3,3,3))$consensus.correlation # 0.23
#duplicateCorrelation(qnt.prot[,c(1:3,5:10)],ndups=1,block=c(1,1,1,2,2,3,3,3,2))$consensus.correlation # 0.37
We have merged the peptides into proteins and are looking here at the correlations within replicates of UV dosage. Replicate 3 of 150mJ dosage is a bit off from the other two while replicate 1 of 275mJ sits by itself allowing the “Pool” sample to cluster with the other 275mJ samples.
I thought that 275mJ.3 and Pool might have been swapped. So the various plots were done to prove whether or not this was true. The “duplicateCorrelation” function yields a much higher correlation when we assume that 275mJ.3 is swapped with Pool.1 than if we didn’t.
Looking at th correlation values, 275.mJ vs Pool is more correlated than 150mJ vs Pool and 400mJ vs Pool. This could be becuase there was more material from sample 275mJ going into the pool. Rayner used the same volume of each of the 9 samples rather than same amount in the pool. Hence the higher correlation (well at least it is my best guess).
#------------------
# Now for some PCAs
# We are trying to look at the odd samples and figure out why they are odd.
#------------------
# Across all 10 samples
prot.pca = prcomp(t(exprs(qnt.prot)),scale=T)
j <- ggbiplot(prot.pca, var.axes=F, groups = factor(c(1,1,1,2,2,2,3,3,3,4)), circle = T,obs.scale=1,labels=rownames(prot.pca$x))
print(j)
# Excluding the Pool.1 sample
prot.pca.rq = prcomp(t(exprs(qnt.prot[,c(1:9)])),scale=T)
g <- ggbiplot(prot.pca.rq, var.axes=F, groups = factor(c(1,1,1,2,2,2,3,3,3)), circle = T,obs.scale=1,labels=rownames(prot.pca.rq$x))
g <- g+labs(colour = "UV Dosage")
print(g)
# Assuming Pool.1 is actually 275mJ.rep1
prot.pca.swap = prcomp(t(exprs(qnt.prot[,c(1:3,5:10)])),scale=T)
h <- ggbiplot(prot.pca.swap, var.axes=F, groups = factor(c(1,1,1,2,2,3,3,3,2)), circle = T,obs.scale=1,labels=rownames(prot.pca.swap$x))
print(h)
# Removing both problem samples
prot.pca.no.dud = prcomp(t(exprs(qnt.prot[,c(1:2,5:10)])),scale=T)
k <- ggbiplot(prot.pca.no.dud, var.axes=F, groups = factor(c(1,1,2,2,3,3,3,4)), circle = T,obs.scale=1,labels=rownames(prot.pca.no.dud$x))
print(k)
# To finish off, some boxplots....
pData(qnt.prot) = pData(impute.res)
boxplot(exprs(qnt.prot),las=2)
I’ve been looking to explain what I see. It seems like the most obvious answer would be that Pool.1 and 275mJ.1 are swapped. However, upon speaking with Rayner, this did not happen. He sets up the tubes in a row and adds the labels in order so the last sample was the pool and the 4th was 275mJ.1. What Rayner did say was that the tube with 150mJ.3 was dropped and sample had a little shake/tumble which might have affected its quality. I don’t know at which stage of the protocol this happened but it did. This would explain the separation of 150mJ.3 from the other two 150mJs. However, the 150mJ triplicate still cluster together on the hclust plots. With sample 275mJ.1, Rayner remembers it being odd but we don’t know why or how.
An alternate explanation is that sample 275mJ.1 did not elute(?) out properly while it was being prepared but it did by the time the pool was made. The pool was made of equal volumes of all 9 samples rather than equal protein amounts so there will be a higher representation of more abundant samples and lower of less abundant samples.
Have gone back to look at the “cor.prot” matrix and as Tom suspected, the Pool.1 correlates slightly better with non-dodgey 275mJ (corr = 0.94) than with 400mJ (0.88) and non-dodgey 150mJ (0.87) but not significantly more, hence the cluster plot looks like it does. The two dogey samples 150mJ.3 and 275mJ.1 have been left out of the average correlation calculation above.
Would like to take a picture of the table of final concentrations of samples that went into the pool and put it in the pData dataframe so we have it for furture reference. Rayner is re-doing experiment.
The boxplots seem to indicate that the two dodgey samples are missing proteins that are lowly expressed. Maybe they were too low to be detected by the mass spec ?
# -------------------------------------------------------------------------------------------------------------------------------------
# Step 6a: Creating a background list of proteins for U20S to be used for functional enrichment
# Using GO terms, KEGG pathways and Interpro domains for function of proteins detected by UV crosslinking and Trizol-enrichment
# All data are based on Trizol-4-step and 400mJ of UV exposure for 2 mins (?)
# -------------------------------------------------------------------------------------------------------------------------------------
#==========================================================================================================================================
# Using U2OS protein list from Geiger et al as the background list. This consists of maximal list of proteins expressed in U20S cell lines
# ~ 7500 proteins. We are however reading in the peptides and aggregating them using MSnbase as we did with the UV dosage data
#==========================================================================================================================================
# Read data from Tom's list of Geiger proteins
# Tom took the Supplementary data from paper and re-annotated it with master proteins using his script.
# The script gets focuses on using Swissprot IDs rather than both Swissprot and Trembl.
# It also marks which proteins are "crap" or "contaminants"
# Finally, it provides a measure of which peptides could be mapped to a unique protein. We use this as a filter for downstream analyses
u2os <- read.delim("Input/Geiger_et_al_2012_U2OS_peptides_plus_master.txt",sep="\t",header=T)
head(u2os)
dim(u2os) # 68621 21
[1] 68621 21
# Filter data - keep peptides with unique master proteins, those which are not "crap" and those that are not missing a master protein
u2os.filt1 = u2os[which(u2os$unique == 1 & u2os$master_protein != "" & u2os$crap_protein != 1),]
# Keep only those columnns with data of interest
# Geiger et al produced a triplicate MS dataset for the U2OS cell line
u2os.filt2 = u2os.filt1[,c(5:6,17,11:16,18:19)]
head(u2os.filt2)
# Checking for peptide loss. We loose ~ 400 peptides out of nearly 70,000 so not too concerned
dim(u2os) # 68621 21
[1] 68621 21
dim(u2os.filt1) # 64967 21
[1] 64967 21
dim(u2os.filt2) # 64967 11
[1] 64967 11
# Create an MSnSet object of the background list
u2os.samp = data.frame(sample=c("Intensity.U2OS_1","Intensity.U2OS_2","Intensity.U2OS_3"),rep=c(1,2,3))
rownames(u2os.samp) = u2os.samp$sample
res.u2os <- MSnSet(exprs = as.matrix(u2os.filt2[,c(5:7)]),fData=u2os.filt2[,-c(5:7)],pData = u2os.samp)
# Impute missing data in the background list from Geiger et al.
# Haven't drawn any plots for this. Can do it at a later date
impute.u2os <- impute(res.u2os,method = "knn")
28533 rows with more than 50 % entries missing;
mean imputation used for these rows
Cluster size 36434 broken into 28537 7897
Cluster size 28537 broken into 7812 20725
Cluster size 7812 broken into 4633 3179
Cluster size 4633 broken into 3045 1588
Cluster size 3045 broken into 1231 1814
Done cluster 1231
Cluster size 1814 broken into 860 954
Done cluster 860
Done cluster 954
Done cluster 1814
Done cluster 3045
Cluster size 1588 broken into 758 830
Done cluster 758
Done cluster 830
Done cluster 1588
Done cluster 4633
Cluster size 3179 broken into 1044 2135
Done cluster 1044
Cluster size 2135 broken into 1220 915
Done cluster 1220
Done cluster 915
Done cluster 2135
Done cluster 3179
Done cluster 7812
Cluster size 20725 broken into 18245 2480
Cluster size 18245 broken into 9841 8404
Cluster size 9841 broken into 3640 6201
Cluster size 3640 broken into 1877 1763
Cluster size 1877 broken into 788 1089
Done cluster 788
Done cluster 1089
Done cluster 1877
Cluster size 1763 broken into 775 988
Done cluster 775
Done cluster 988
Done cluster 1763
Done cluster 3640
Cluster size 6201 broken into 3330 2871
Cluster size 3330 broken into 2482 848
Cluster size 2482 broken into 1250 1232
Done cluster 1250
Done cluster 1232
Done cluster 2482
Done cluster 848
Done cluster 3330
Cluster size 2871 broken into 1561 1310
Cluster size 1561 broken into 350 1211
Done cluster 350
Done cluster 1211
Done cluster 1561
Done cluster 1310
Done cluster 2871
Done cluster 6201
Done cluster 9841
Cluster size 8404 broken into 5706 2698
Cluster size 5706 broken into 2228 3478
Cluster size 2228 broken into 1013 1215
Done cluster 1013
Done cluster 1215
Done cluster 2228
Cluster size 3478 broken into 1269 2209
Done cluster 1269
Cluster size 2209 broken into 1207 1002
Done cluster 1207
Done cluster 1002
Done cluster 2209
Done cluster 3478
Done cluster 5706
Cluster size 2698 broken into 1450 1248
Done cluster 1450
Done cluster 1248
Done cluster 2698
Done cluster 8404
Done cluster 18245
Cluster size 2480 broken into 1298 1182
Done cluster 1298
Done cluster 1182
Done cluster 2480
Done cluster 20725
Done cluster 28537
Cluster size 7897 broken into 4366 3531
Cluster size 4366 broken into 1923 2443
Cluster size 1923 broken into 1388 535
Done cluster 1388
Done cluster 535
Done cluster 1923
Cluster size 2443 broken into 1369 1074
Done cluster 1369
Done cluster 1074
Done cluster 2443
Done cluster 4366
Cluster size 3531 broken into 1164 2367
Done cluster 1164
Cluster size 2367 broken into 1331 1036
Done cluster 1331
Done cluster 1036
Done cluster 2367
Done cluster 3531
Done cluster 7897
# Aggregate Geiger et al data from peptides to proteins
# 64967 peptides are aggregated to 7507 proteins
agg.u2os = combineFeatures(impute.u2os,groupBy = fData(impute.u2os)$master_protein,cv = T,fun = "median")
Combined 64967 features into 64967 using median
dim(agg.u2os)
[1] 7507 3
There are 7507 proteins in the U20S cell line, across 3 replicates, based on the Geiger et al., dataset. The next step is to annotate this gene list with the various functional categories that we want to perform enrichment analysis for.
head(bg.list.bm)
dim(bg.list.bm)
[1] 183612 6
Now that we have mapped genes to annotations, time for some enrichment analysis.
The protein “universe” used here is the list of all proteins discovered in Geiger et al., 2012. The list of peptides was mapped to proteins by Tom and annotated with master protein identifiers, crap proteins and uniqueness. This was used as it is the most comprehensive list of U2OS proteins mapped using mass spec to date. After filtering, there were ~7500 proteins generated in the study.
The “DE list” or “enriched” list of proteins are those that were expressed in the UV dosage experiment across all 9 samples. This yields 1744 proteins of which ~1500 could be mapped to GO identifiers. I tried doing this mapping with both ‘bioMart’ and ‘org.Hs.db’ databases available as packages within R. The former yielded more mappings so this is what I proceeded with.
For GO enrichment, I used the ‘goseq’ package which accounts for any bias (here abundance of protein) and then checks for enrichment. ‘goseq’ yielded 102 terms of which 45 were BP, 38 were CC and 19 were MF terms. Of the BP (Biological Process) terms, more than half were involved in RNA processing and translation.
As an alternative, I used clusterProfiler’s “enrichGO” function which does not take into account any biases. This yielded ~438 enriched terms of which 261 were BP, 64 were MF and 113 were CC. Of the biological process terms,63 were related to RNA processing.
head(qm)
DataFrame with 6 rows and 14 columns
_id X_score entrezgene
<character> <numeric> <integer>
1 55236 20.92187 55236
2 92312 21.71693 92312
3 285590 20.91144 285590
4 23019 20.91486 23019
5 23347 20.29458 23347
6 100130361 21.72836 100130361
interpro
<list>
1 c("Ubiquitin-activating enzyme E1", "Ubiquitin-activating enzyme E1, C-terminal", "Ubiquitin/SUMO-activating enzyme E1"),c("IPR018075", "IPR018965", "IPR000011"),c("UBQ-activ_enz_E1", "Ub-activating_enz_E1_C", "UBQ/SUMO-activ_enz_E1-like")
2 c("Zinc finger, RING/FYVE/PHD-type", "K Homology domain", "K Homology domain, type 1"),c("IPR013083", "IPR004087", "IPR004088"),c("Znf_RING/FYVE/PHD", "KH_dom", "KH_dom_type_1")
3 c("SH3 and PX domain-containing protein 2B", "SH3 domain", "Variant SH3 domain"),c("IPR030512", "IPR001452", "IPR011511"),c("SH3PXD2B", "SH3_domain", "SH3_2")
4 c("CCR4-Not complex component, Not1, C-terminal", "CCR4-Not complex, Not1 subunit, domain of unknown function DUF3819", "CCR4-NOT transcription complex subunit 1, HEAT repeat"),c("IPR007196", "IPR024557", "IPR032194"),c("CCR4-Not_Not1_C", "CCR4-Not_Not1su_DUF3819", "CNOT1_HEAT")
5 c("SMCs flexible hinge", "Histidine kinase-like ATPase, C-terminal domain"),c("IPR010935", "IPR003594"),c("SMC_hinge", "HATPase_C")
6 Protein of unknown function DUF4641,IPR027822,DUF4641
symbol query ensembl.gene ensembl.protein ensembl.transcript
<character> <character> <character> <list> <list>
1 UBA6 A0AVT1 ENSG00000033178 ENSP00000313454,ENSP00000399234,ENSP00000421984,... ENST00000322244,ENST00000420827,ENST00000429659,...
2 MEX3A A1L020 ENSG00000254726 ENSP00000432845 ENST00000442784,ENST00000532414
3 SH3PXD2B A1X283 ENSG00000174705 ENSP00000309714,ENSP00000428076,ENSP00000430890,... ENST00000311601,ENST00000518522,ENST00000519643,...
4 CNOT1 A5YKK6 ENSG00000125107 ENSP00000320949,ENSP00000413113,ENSP00000454377,... ENST00000317147,ENST00000441024,ENST00000562046,...
5 SMCHD1 A6NHR9 ENSG00000101596 ENSP00000326603,ENSP00000462050,ENSP00000463036,... ENST00000320876,ENST00000577300,ENST00000577880,...
6 CXorf49 A8MYA2 ENSG00000215115 ENSP00000480355 ENST00000452468,ENST00000535490
ensembl.translation
<list>
1 c("ENSP00000313454", "ENSP00000425091", "ENSP00000421984"),c("ENST00000322244", "ENST00000514261", "ENST00000505673")
2 ENSP00000432845,ENST00000532414
3 c("ENSP00000490082", "ENSP00000430890", "ENSP00000428076"),c("ENST00000636523", "ENST00000519643", "ENST00000518522")
4 c("ENSP00000320949", "ENSP00000455635", "ENSP00000456649"),c("ENST00000317147", "ENST00000569240", "ENST00000567188")
5 c("ENSP00000326603", "ENSP00000462050", "ENSP00000463049"),c("ENST00000320876", "ENST00000585229", "ENST00000577880")
6 ENSP00000480355,ENST00000535490
go.BP
<list>
1 c("IMP", "IBA", "IEA"),c("GO:0006511", "GO:0006974", "GO:0007612"),c(17889673, NA, NA),c("ubiquitin-dependent protein catabolic process", "cellular response to DNA damage stimulus", "learning"),...
2
3 c("IMP", "IMP", "IEA"),c("GO:0001501", "GO:0001654", "GO:0002051"),c(20137777, 20137777, NA),c("skeletal system development", "eye development", "osteoblast fate commitment"),...
4 c("IDA", "TAS", "IEA"),c("GO:0000122", "GO:0000289", "GO:0006351"),list(16778766, NULL, NULL),c("negative regulation of transcription from RNA polymerase II promoter", "nuclear-transcribed mRNA poly(A) tail shortening", "transcription, DNA-templated"),...
5 c("IMP", "IEA", "IEA"),c("GO:0043584", "GO:0051276", "GO:0060821"),list(c(28067909, 28067911), NULL, NULL),c("nose development", "chromosome organization", "inactivation of X chromosome by DNA methylation"),...
6
go.CC
<list>
1 c("IBA", "IDA", "TAS"),c("GO:0005737", "GO:0005737", "GO:0005829"),c("cytoplasm", "cytoplasm", "cytosol")
2 c("IEA", "IEA", "IDA"),c("GO:0000932", "GO:0005634", "GO:0005829"),c("P-body", "nucleus", "cytosol")
3 c("IBA", "ISS", "ISS"),c("GO:0002102", "GO:0002102", "GO:0005737"),c("podosome", "podosome", "cytoplasm")
4 c("IDA", "ISS", "IDA"),c("GO:0000932", "GO:0000932", "GO:0005615"),c(21976065, NA, 22664934),c("P-body", "P-body", "extracellular space"),...
5 c("IDA", "IEA"),c("GO:0000784", "GO:0001740"),c(24270157, NA),c("colocalizes_with", NA),c("nuclear chromosome, telomeric region", "Barr body"),...
6
go.MF
<list>
1 c("IBA", "IPI", "IEA"),c("GO:0004839", "GO:0005515", "GO:0005524"),c("ubiquitin activating enzyme activity", "protein binding", "ATP binding"),list(NULL, c(17889673, 25416956, 25422469), NULL),...
2 c("IDA", "IEA"),c("GO:0003723", "GO:0046872"),c(22681889, NA),c("RNA binding", "metal ion binding"),...
3 c("IPI", "IBA", "ISS"),c("GO:0005515", "GO:0010314", "GO:0010314"),list(c(19755710, 20609497), NULL, NULL),c("protein binding", "phosphatidylinositol-5-phosphate binding", "phosphatidylinositol-5-phosphate binding"),...
4 c("IDA", "IDA", "IPI"),c("GO:0003723", "GO:0004535", "GO:0005515"),list(c(22658674, 22681889), 21976065, c(10637334, 16778766, 17452450, 18377426, 21278420, 21981923, 22977175, 23232451, 23463101, 23644599, 24736845, 24768540, 25038453, 26496610)),c("RNA binding", "poly(A)-specific ribonuclease activity", "protein binding"),c(NA, "contributes_to", NA),...
5 c("IEA", "ISS"),c("GO:0005524", "GO:0016887"),c("ATP binding", "ATPase activity")
6
domains
<character>
1 UBQ-activ_enz_E1; Ub-activating_enz_E1_C; UBQ/SUMO-activ_enz_E1-like; ThiF_NAD_FAD-bd; E1_FCCH; UBA_E1_Cys; E1_4HB; NAD(P)-bd_dom
2 Znf_RING/FYVE/PHD; KH_dom; KH_dom_type_1; Znf_RING
3 SH3PXD2B; SH3_domain; SH3_2; Phox
4 CCR4-Not_Not1_C; CCR4-Not_Not1su_DUF3819; CNOT1_HEAT; CNOT1_CAF1_bind; CNOT1_TTP_bind
5 SMC_hinge; HATPase_C
6 DUF4641
Would like to map each of the proteins to PFAM/SMART domains to see if they are indeed RNA binding proteins…… Have mapped RNA BP domains to both Geiger and UV.dosage. Need to look at the genes involved for which we have further evidence that they are indeed RBPs
The domain “Nucleotide-bd_a/b_plait” is present in almost all (16/18) heterogeneous nuclear ribonucleoproteins whose main task is to move mRNA out of the nucleus. This domain has an interpro entry IPR012677 which is no longer valid. This domain is also present in nucleolin and EWSR1. This might be the “RGG-box” domain eluded to in Burd and Dreyfuss, 1994. Excitingly, “Nucleotide-bd_a/b_plait” is a top domain the goseq analysis. I will substitute hnRNP searches with this term.
“Ig-like”/“Ig_sub” from Tom’s notes are indicative of glycoproteins which might be RNA-binding. There are 58 Ig term related proteins in the UV-dosage experiments of which only 2 have RNA-binding domains indicating only a small fraction have RNA binding capapbilities but majority of them are involved in other functions.
#--------------------------------------------------------------------
# 07 : Mapping oligodT data to domains and looking for enrichment
#--------------------------------------------------------------------
oligo.cl.in.nc = read.table("Input/Leicester-oligodT-CL-prot.txt",header=T,sep="\t") # 328 gene symbols from Rayner's file
oligo.nc = read.table("Input/Leicester-oligodT-NC-prot.txt",header=T,sep="\t") # 73 gene symbols from Rayner's file
dim(oligo.cl.in.nc)
[1] 328 5
dim(oligo.nc)
[1] 73 5
# Present in non-crosslinked and hence will be removed
oligo.cl = oligo.cl.in.nc[-which(oligo.cl.in.nc$GeneName %in% oligo.nc$GeneName),]
dim(oligo.cl) # 258 are present only in the crosslinked and not in non-crosslinked samples
[1] 258 5
# Which genes are present in both cross and non-crosslinked samples
cl.and.nc.prot = oligo.cl.in.nc[which(oligo.cl.in.nc$GeneName %in% oligo.nc$GeneName),]
dim(cl.and.nc.prot) # 70 5
[1] 70 5
# Mapping gene symbols to proteins
# One transcript can be spliced alternatively to produce multiple proteins so we expect more when we map to uniprot
# We now have 972 lines mapping the 328 genes to proteins/domains
# For crosslinked samples
oligo.cl.qm = queryMany(unique(oligo.cl$GeneName),scopes="symbol",fields=c("id","entrezgene","name","uniprot","interpro"))
Finished
Pass returnall=TRUE to return lists of duplicate or missing query terms.
oligo.cl.qm$domains = sapply(sapply(oligo.cl.qm$interpro,"[[",3),function(x) paste(x,collapse="; "))
oligo.cl.qm = as.data.frame(oligo.cl.qm)
dim(oligo.cl.qm)
[1] 762 10
# For non-crosslinked samples
oligo.nc.qm = queryMany(unique(oligo.nc$GeneName),scopes="symbol",fields=c("id","entrezgene","name","uniprot","interpro"))
Finished
Pass returnall=TRUE to return lists of duplicate or missing query terms.
oligo.nc.qm$domains = sapply(sapply(oligo.nc.qm$interpro,"[[",3),function(x) paste(x,collapse="; "))
oligo.nc.qm = as.data.frame(oligo.nc.qm)
dim(oligo.nc.qm)
[1] 224 10
In this first step, we are reading in oligodT data from one experiment with both crosslinked(cl) and non-crosslinked(nc) samples. We remove proteins from the ‘cl’ which were also present in ‘nc’ as we cannot comment on enrichment.
In addition, we annotate proteins given by their GeneNames/HGNC Symbols to entrezgene id, uniprot id, gene description, interpro domains and trembl ids. Because one gene can be alternatively spliced to multiple proteins, we have a one-to-many mapping and 258 crosslinked genes give us 762 crosslinked proteins. Similarly, 73 non-crosslinked genes give us 224 non-crosslinked proteins
# -----------------------------------
# Some filtering of oligodT proteins
#------------------------------------
# Filtering for those with Swissprot ids only
#--------------------------------------------
# 587/762 cross-linked proteins are high confidence and are not missing SwissProt ids
oligo.cl.sp = oligo.cl.qm[which(!is.na(oligo.cl.qm$uniprot.Swiss.Prot)),]
length(unique(oligo.cl.sp$uniprot.Swiss.Prot))
[1] 587
# 176/224 non-crosslined proteins are high confidence
oligo.nc.sp = oligo.nc.qm[which(!is.na(oligo.nc.qm$uniprot.Swiss.Prot)),]
length(unique(oligo.nc.sp$uniprot.Swiss.Prot))
[1] 176
# Remove mass spec contaminant proteins as we did with Trizol data
#--------------------------------------------------------------------
oligo.cl.no.contam = oligo.cl.sp[-which(oligo.cl.sp$uniprot.Swiss.Prot %in% contam$Protein.Group.Accessions),]
length(unique(oligo.cl.no.contam$uniprot.Swiss.Prot)) # 747 so we removed 5 contaminants ALB, KRT2, KRT10, RPS27A, KRT1
[1] 586
oligo.nc.no.contam = oligo.nc.sp[-which(oligo.nc.sp$uniprot.Swiss.Prot %in% contam$Protein.Group.Accessions),]
length(unique(oligo.nc.no.contam$uniprot.Swiss.Prot)) # 172 so we removed 4 contaminants KRT2, KRT10, RPS27A, KRT1
[1] 172
We have two datasets now - oligo.cl.no.contam made of 586 unique proteins and oligo.nc.no.contam made of 172 proteins. Next step is to set up goseq. In the initial run, I forgot that the bias data was from U2OS Geiger rather than our own expression so will implement this.
table(all.genes.comp)
all.genes.comp
0 1
7469 38
Interestingly, the oligodT data seems a lot “cleaner” than trizol dataset. By this I mean, the top domains are most definitely all known RNA-binding domains based on literature looking at RBPs (Lunde 2007, Burd 1994). In the Trizol data we get “Ig-like” domains as one of the top hits which we don’t see at all in the oligodT data.
Between crosslinked and non-crosslinked samples, we still see some overlap in that we are getting RNA-binding domains and proteins in non-crosslinked samples but the number of such proteins in a LOT lower in non-crosslinked than in crosslinked samples.
#------------------------------------------------------------------------------------------------------------------------
# Step 10:Performing enrichment analysis using 'clusterProfiler' and 'goseq' packages
# In the first instance we will use GO terms and KEGG terms for enrichment analysis
#------------------------------------------------------------------------------------------------------------------------
geiger = as.character(unique(fData(agg.u2os)$master_protein))
beck = read.table("Input/Beck2011_n5781.txt")
beck = as.character(beck$V1)
lundberg.ens = read.table("Input/Lundberg2010_n5480.txt")
lundberg = unique(bitr(as.character(lundberg.ens$V1),fromType="ENSEMBL", toType="UNIPROT", OrgDb="org.Hs.eg.db")$UNIPROT)
length(lundberg)
library(venn)
library(gplots)
library(limma)
?vennCounts
?vennDiagram
venn.u2os = venn(list(Geiger2012=geiger,Beck2011=beck,Lundberg2010=lundberg))
Small exercise on how much Geiger et al, Beck et al, and Lundberg et al., protein sets from Mass Spectrometery experiments overlap. With Lundberg et al., I had to map Ensembl IDs to UNIPROT and then do the comparison which caused a one-to-many mapping. The excess 4941 only found in Lundberg et al is almost exclusively due to the mapping of one Ensembl ID to many UNIPROT IDs. Geiger et al, being the most recent study does claim to have maximal protein mapping for U2OS. Beck et al.,
Should we use just the 4145 that overlaps as our high confident set or focus on Geiger as it the most recent ?
#-------------------
# goseq with KEGG
#-------------------
# Restricted to only those proteins that had KEGG mappings in bioMart so not very extensive.
# Better mapping and analysis with clusterProfiler....
#------------------------------
# Function : mapPathwayToName
#------------------------------
mapPathwayToName <- function(organism) {
KEGG_PATHWAY_LIST_BASE <- "http://rest.kegg.jp/list/pathway/"
pathway_list_REST_url <- paste(KEGG_PATHWAY_LIST_BASE, organism, sep="")
pathway_id_name <- data.frame()
for (line in readLines(pathway_list_REST_url)) {
tmp <- strsplit(line, "\t")[[1]]
pathway_id <- strsplit(tmp[1], organism)[[1]][2]
pathway_name <- tmp[2]
pathway_name <- strsplit(pathway_name, "\\s+-\\s+")[[1]][1]
pathway_id_name[pathway_id, 1] = pathway_name
}
names(pathway_id_name) <- "pathway_name"
pathway_id_name
}
kegg.names = mapPathwayToName('hsa')
kegg.list = bg.list.bm[which(bg.list.bm$kegg_enzyme != ""),]
kegg.list$kegg = sapply(strsplit(kegg.list$kegg_enzyme,"\\+"),"[[",1)
# Check for GO enrichment given the bias data using KEGG mappings
kegg.all.genes = all.genes[which(names(all.genes) %in% kegg.list$uniprotswissprot)]
table(kegg.all.genes)
kegg.bias.df = bias.df[which(bias.df$UNIPROT %in% kegg.list$uniprotswissprot),]
# Go seq based on KEGG
#write.table(data.frame(all.genes),"All-genes-for-goseq.txt",sep="\t",row.names=T, quote=F)
kegg.ids = read.table("Input/All-genes-for-goseq-with-KEGG.txt",sep="\t",header=T)
KEGG.pwf = nullp(all.genes,'hg19','ensGene', bias.data=as.numeric(bias.df$protbias),plot.fit=TRUE)
KEGG.wall = goseq(KEGG.pwf,gene2cat = kegg.ids[,c(1,2)])
KEGG.wall$BH_over_represented_pvalue = p.adjust(KEGG.wall$over_represented_pvalue,method = "BH")
KEGG.wall$description = kegg.names[KEGG.wall$category,]
KEGG.wall = KEGG.wall[,c(1,7,4:5,2,6,3)]
KEGG.wall
prot.list = as.character(unique(prot.list.bm$entrezgene)) # n = 1516
bg.list = as.character(unique(bg.list.bm$entrezgene)) # n = 7477
ggo.bp <- enrichGO(gene = prot.list,universe = bg.list,OrgDb = org.Hs.eg.db,ont = "BP",qvalueCutoff = 0.05,readable = TRUE)
ggo.mf <- enrichGO(gene = prot.list,universe = bg.list,OrgDb = org.Hs.eg.db,ont = "MF",qvalueCutoff = 0.05,readable = TRUE)
ggo.cc <- enrichGO(gene = prot.list,universe = bg.list,OrgDb = org.Hs.eg.db,ont = "CC",qvalueCutoff = 0.05,readable = TRUE)
ggo = rbind(data.frame(ggo.bp,Ontology="BP"),data.frame(ggo.mf,Ontology="MF"),data.frame(ggo.cc,Ontology="CC"))
ggo = ggo[,c(1:2,10,3:4,7,5:6)]
ggo
dotplot(ggo.mf)
dotplot(ggo.bp)
dotplot(ggo.cc)
head(ggo,n=20)
# Doing a KEGG based enrichment for the enriched protein list
kegg.prot = enrichKEGG(gene = prot.list,universe = bg.list,qvalueCutoff = 0.05)
kegg.prot = data.frame(kegg.prot)
kegg.prot = kegg.prot[order(kegg.prot$Count,decreasing=T),]